import pandas as pd
import numpy as np
from datetime import datetime
# plot
import matplotlib.pyplot as plt
import seaborn as sns
# warnings
import warnings; warnings.filterwarnings("ignore")
# data
# https://www.kaggle.com/datasets/julienjta/nyc-taxi-traffic
path = 'dataset.csv'
# load the NYC taxi traffic data (see Kaggle link above)
NYtaxi = pd.read_csv(path)
# keep row/column counts as globals; `r` is reused later to derive the day count
r, c = NYtaxi.shape
print(f"This dataset has {r} rows and {c} columns")
This dataset has 10320 rows and 3 columns
# drop the leftover CSV index column
NYtaxi = NYtaxi.drop(columns=['Unnamed: 0'])
# rename to ds and y for prophet model
NYtaxi.columns = ['ds', 'y']
NYtaxi.head()  # data are ordered by time
| ds | y | |
|---|---|---|
| 0 | 2014-07-01 00:00:00 | 10844 |
| 1 | 2014-07-01 00:30:00 | 8127 |
| 2 | 2014-07-01 01:00:00 | 6210 |
| 3 | 2014-07-01 01:30:00 | 4656 |
| 4 | 2014-07-01 02:00:00 | 3820 |
# let's see the distribution of data
NYtaxi['y'].hist();
# time span covered by the data
start, end = NYtaxi['ds'].agg(['min', 'max'])
days = r // (24 * 2)  # half-hourly samples -> 48 per day
print(f"This dataset starts from {start} and ends in {end}, total {days} days")
This dataset starts from 2014-07-01 00:00:00 and ends in 2015-01-31 23:30:00, total 215 days
# share of missing values per column (0.0 for both ds and y)
NYtaxi.isna().mean()
ds 0.0 y 0.0 dtype: float64
# number of fully duplicated rows (zero here)
NYtaxi.duplicated().sum()
0
The data arrive in tidy form, with no missing values and no duplicated rows, which is good.
# define the length of a period: samples are half-hourly, so one day = 24*2 = 48 points
freq = 24*2
Based on my experience, the number of taxi passengers usually varies from hour to hour rather than from day to day, so I use a single day as the window size.
# Define a function to count anomalies
def anomalies(NYtaxi):
    """Count observations outside the [lower, upper] tolerance band.

    Expects a DataFrame with 'y', 'upper' and 'lower' columns, prints a
    summary and returns the pair ``(upper_count, lower_count)``.

    Fix: the original returned the result of ``print()`` (always None);
    returning the counts makes the result reusable and testable, while
    existing callers that ignore the return value are unaffected.
    """
    upper = int((NYtaxi['y'] > NYtaxi['upper']).sum())
    lower = int((NYtaxi['y'] < NYtaxi['lower']).sum())
    print(f"{upper} on the upper side, {lower} in the lower side \nTotal {upper+lower} anomalies")
    return upper, lower
# define a function to plot monthly division lines
def m_division():
    """Draw a dashed vertical line at the first day of each month covered by
    the data (Jul 2014 - Feb 2015) on the current matplotlib axes.

    Fix: the original ``range(1, 12)`` stopped at November, so the
    December 2014 boundary was never drawn; ``range(1, 13)`` includes it.
    """
    for month in range(1, 13):
        if month > 6:  # Jul-Dec 2014
            plt.axvline(datetime(2014, month, 1), color='k', linestyle='--', alpha=0.5)
        if month < 3:  # Jan-Feb 2015 (data end on 2015-01-31)
            plt.axvline(datetime(2015, month, 1), color='k', linestyle='--', alpha=0.5)
# Calculate rolling window mean over one day (`freq` = 48 half-hour samples)
NYtaxi['SMA'] = NYtaxi['y'].rolling(freq).mean()
# residual of each observation against its daily moving average
NYtaxi['diff'] = NYtaxi['y'].sub(NYtaxi['SMA'])
# plot data distribution with SMA along the timeline
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(20, 8))
fig.suptitle('The distribution of diff')
sns.lineplot(data=NYtaxi[['y', 'SMA']], ax=axes[0])
sns.histplot(data=NYtaxi, x='diff', ax=axes[1])
plt.grid()
From the time series plot, we see our dataset repeats its pattern in a short-period way
Based on the histplot above, we notice this dataset is negatively skewed
# create a tolerance band covering the central 95% of the SMA residuals
# (the 2.5% and 97.5% quantiles of `diff`, centred on the moving average)
lo_q, hi_q = NYtaxi['diff'].quantile([0.025, 0.975])
NYtaxi['upper'] = NYtaxi['SMA'] + hi_q
NYtaxi['lower'] = NYtaxi['SMA'] + lo_q
# plot the series together with its tolerance band
def plot_tb():
    """Scatter the observations and shade the [lower, upper] band."""
    plt.figure(figsize=[20, 8])
    sns.scatterplot(data=NYtaxi['y'], s=10)
    xs = np.arange(len(NYtaxi))
    plt.fill_between(xs, NYtaxi['lower'], NYtaxi['upper'],
                     alpha=0.4, color="grey",
                     label="Predicted interval")
    plt.xlabel("Ordered samples.")
    plt.ylabel("Values and prediction intervals.")
    plt.show()
plot_tb()
Using the tolerance band, we see there are some anomalies
# report how many points violate the SMA tolerance band
anomalies(NYtaxi)
257 on the upper side, 257 in the lower side Total 514 anomalies
from statsmodels.tsa.api import SimpleExpSmoothing
# build model: exponential smoothing with a fixed alpha of 0.2
# (optimized=False disables automatic tuning of the smoothing level)
EMAfit = SimpleExpSmoothing(NYtaxi['y']).fit(smoothing_level=0.2,optimized=False)
# 3-step-ahead forecast, labelled with the alpha used (for plot legends)
EMA = EMAfit.forecast(3).rename(r'$\alpha=0.2$')
# in-sample predictions over the whole series
NYtaxi['EMA'] = EMAfit.predict(start = 0)
# create diff variable: residual of the ES fit
NYtaxi['diff2'] = NYtaxi['y'] - NYtaxi['EMA']
# plot the fit against the data and the residual distribution
fig, axes = plt.subplots(2,1,figsize=[20,8])
sns.lineplot(ax = axes[0], data=NYtaxi[['y','EMA']])
sns.histplot(ax = axes[1], data=NYtaxi, x='diff2')
plt.grid()
Under the ES method, data shape is close to a normal distribution
# plot a tolerance band around the ES fit
# fix: build the band from the ES residuals ('diff2') — the original reused
# the SMA residuals ('diff'), so the band did not reflect this model's errors
NYtaxi['upper'] = NYtaxi['EMA'] + NYtaxi['diff2'].quantile(0.975)
NYtaxi['lower'] = NYtaxi['EMA'] + NYtaxi['diff2'].quantile(0.025)
plot_tb()
Using tolerance band defined by the ES method, we see fewer anomalies
# report how many points violate the ES tolerance band
anomalies(NYtaxi)
118 on the upper side, 3 in the lower side Total 121 anomalies
import statsmodels.api as sm
# modify data format: fresh integer index, then a DatetimeIndex built from ds
NYtaxi = NYtaxi.reset_index(drop=True)  # fix: drop expects a bool, not 'index'
NYtaxi.index = pd.to_datetime(NYtaxi['ds'])
# since data were recorded every half hour, one day is 24*2 samples
# fix: the `freq=` keyword was renamed to `period=` in statsmodels 0.11
# and removed in later releases
result = sm.tsa.seasonal_decompose(NYtaxi['y'], period=freq, model='additive')
# plot the trend component extracted by the decomposition
plt.figure(figsize=(20, 8))
sns.lineplot(data=result.trend)
plt.show()
The trend is relatively stable over the first half of the series, but becomes somewhat more variable toward the end.
# plot seasonal pattern
plt.figure(figsize=(20, 8))
# the series has no long-run seasonality, so show the
# daily pattern over the first 10 periods only
sns.lineplot(data=result.seasonal.iloc[:freq * 10])
plt.show()
It is hard to see any large trend if we look at all datapoints at once. So we only pull out 10 periods, it looks like the seasonality is steady
# plot the residual component, again over 10 periodical lengths
plt.figure(figsize=(20, 8))
sns.lineplot(data=result.resid.iloc[1:freq * 10])
plt.show()
Notes: when we add the trend and seasonal components together, we obtain the model's estimate of the y values.
# reconstruct the fitted values from the trend + seasonal components
estimated = result.trend + result.seasonal
plt.figure(figsize=(20, 8))
plt.plot(NYtaxi.y, color='r')  # actual values in red
plt.plot(estimated)
plt.title("Actual value and estimation")
Text(0.5, 1.0, 'Actual value and estimation')
# Detect outliers using three sigma method
resid = result.resid
resid_mu, resid_dev = resid.mean(), resid.std()
# band of mean +/- 3 standard deviations
lower = resid_mu - 3 * resid_dev
upper = resid_mu + 3 * resid_dev
# plot the residuals with the tolerance band shaded
plt.figure(figsize=(20, 8))
plt.plot(resid, color='purple')
plt.fill_between([datetime(2014, 7, 1), datetime(2015, 1, 31)], lower, upper,
                 color='y', alpha=0.25, linestyle='--', linewidth=2)
plt.xlim(datetime(2014, 7, 1), datetime(2015, 1, 31))
plt.show()
From this view, we can clearly see where the anomalies are and how far they are from the mean.
# NOTE(review): this rebinds the name `anomalies`, shadowing the counting
# function defined earlier — that function is unusable from this point on;
# consider renaming this DataFrame (e.g. `anomaly_df`)
anomalies = NYtaxi[(resid < lower) | (resid > upper)]
n = anomalies.shape[0]
print(f"Using STD method, we detected {n} anomalies")
Using STD method, we detected 45 anomalies
# highlight the detected anomalies on top of the full series
plt.figure(figsize=(20, 8))
plt.plot(NYtaxi.y)
m_division()  # draw the monthly division lines
plt.scatter(anomalies.index, anomalies.y, color='r', marker='D')
<matplotlib.collections.PathCollection at 0x1430b532190>
from prophet import Prophet
from prophet.plot import plot_plotly
import plotly.offline as py
py.init_notebook_mode()
%matplotlib inline
# Fitting with default parameters plus explicit daily seasonality and a 95%
# uncertainty interval; yearly seasonality is off (only ~7 months of data)
NYtaxi_model_0 = Prophet(daily_seasonality=True, yearly_seasonality=False, interval_width=0.95)
NYtaxi_model_0.fit(NYtaxi)
<prophet.forecaster.Prophet at 0x1430b54c760>
# in-sample forecast: periods=0 adds no rows beyond the training data
future = NYtaxi_model_0.make_future_dataframe(periods=0, freq='d')
NYtaxi_model_0_data = NYtaxi_model_0.predict(future)
# plot change points
from prophet.plot import add_changepoints_to_plot
# fix: reuse the prediction computed above instead of calling predict() a
# second time; also drop the stray plt.figure() that only produced an empty
# figure (Prophet.plot creates its own)
forecast = NYtaxi_model_0_data
fig = NYtaxi_model_0.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), NYtaxi_model_0, forecast)
<Figure size 1080x576 with 0 Axes>
The trendline seems predicting well
# plot components of the time series pattern
# (trend plus the seasonalities enabled on the model)
plots = NYtaxi_model_0.plot_components(NYtaxi_model_0_data)
# define model with a more flexible trend (larger changepoint_prior_scale)
NYtaxi_model_1 = Prophet(  # n_changepoints=30, # this is optional
    yearly_seasonality=False,
    changepoint_prior_scale=0.5)
# fit on the first 84 days only (freq*84 half-hourly rows)
# fix: the original bound the result of fit() to a variable named `forecast`,
# but Prophet.fit returns the model itself, not a forecast — call predict on
# the model directly to avoid the misleading alias
NYtaxi_model_1.fit(NYtaxi[0:freq*84])
# forecast 21 days beyond the truncated training window
future1 = NYtaxi_model_1.make_future_dataframe(periods=21, freq='d')
NYtaxi_model_1_data = NYtaxi_model_1.predict(future1)
fig = NYtaxi_model_1.plot(NYtaxi_model_1_data)
a = add_changepoints_to_plot(fig.gca(), NYtaxi_model_1, NYtaxi_model_1_data)
With a short period and more change points, the prediction looks fine
# plot components of the time series pattern for the second model
plots = NYtaxi_model_1.plot_components(NYtaxi_model_1_data)
# Diagnostics for the first model
from prophet.diagnostics import cross_validation
# NOTE(review): Prophet's defaults would be an initial window of three times
# the horizon with cutoffs every half horizon; the arguments below override
# that with a 110-day initial window, a cutoff every 30 days and a 10-day
# forecast horizon (4 folds over this dataset, per the logged output)
NYtaxi_0_cv = cross_validation(NYtaxi_model_0,
initial='110 days',
period='30 days',
horizon = '10 days')
NYtaxi_0_cv.head()
INFO:prophet:Making 4 forecasts with cutoffs between 2014-10-23 23:30:00 and 2015-01-21 23:30:00
| ds | yhat | yhat_lower | yhat_upper | y | cutoff | |
|---|---|---|---|---|---|---|
| 0 | 2014-10-24 00:00:00 | 17985.969690 | 11392.046916 | 25475.457980 | 19061 | 2014-10-23 23:30:00 |
| 1 | 2014-10-24 00:30:00 | 16518.017981 | 9665.507922 | 23429.184971 | 16689 | 2014-10-23 23:30:00 |
| 2 | 2014-10-24 01:00:00 | 14876.985401 | 7922.606694 | 21311.992921 | 13006 | 2014-10-23 23:30:00 |
| 3 | 2014-10-24 01:30:00 | 13057.159047 | 6174.678890 | 19522.777664 | 9512 | 2014-10-23 23:30:00 |
| 4 | 2014-10-24 02:00:00 | 11116.283260 | 4051.407720 | 17532.476577 | 7745 | 2014-10-23 23:30:00 |
# Performance metrics for the cross-validation of model 0
# fix: import from `prophet`, consistent with the other imports in this file —
# `fbprophet` is the deprecated pre-1.0 package name and fails on current installs
from prophet.diagnostics import performance_metrics
NYtaxi_0_p = performance_metrics(NYtaxi_0_cv)
NYtaxi_0_p.head()
| horizon | mse | rmse | mae | mape | mdape | coverage | |
|---|---|---|---|---|---|---|---|
| 0 | 1 days 00:00:00 | 1.514466e+07 | 3891.614146 | 2976.231387 | 0.285119 | 0.158821 | 0.916667 |
| 1 | 1 days 00:30:00 | 1.499741e+07 | 3872.648536 | 2965.115833 | 0.285488 | 0.158821 | 0.916667 |
| 2 | 1 days 01:00:00 | 1.483121e+07 | 3851.131226 | 2962.173396 | 0.285959 | 0.160520 | 0.921875 |
| 3 | 1 days 01:30:00 | 1.458380e+07 | 3818.874092 | 2951.699249 | 0.286957 | 0.162781 | 0.927083 |
| 4 | 1 days 02:00:00 | 1.429550e+07 | 3780.939068 | 2932.613780 | 0.287329 | 0.162781 | 0.932292 |
I am not picky since this dataset is not a good example for time series analysis, but hope I can generate better predictions next time.
# How many points fall outside the Prophet prediction interval?
ha = int((NYtaxi_0_cv['y'] > NYtaxi_0_cv['yhat_upper']).sum())
la = int((NYtaxi_0_cv['y'] < NYtaxi_0_cv['yhat_lower']).sum())
print(f"Total {ha+la} anomalies, {ha} on the upper side, {la} on the lower side")
Total 304 anomalies, 66 on the upper side, 238 on the lower side
# Summary of anomaly counts per method, one row per model.
# fix: the numbers now match the printed results above — ES was 118 upper / 3
# lower (total 121, not 128), and Prophet was 304 = 66 upper + 238 lower per
# the cross-validation check (not 303/68/235); also fixed the 'comparsion'
# typo and the inconsistent lower-case 'lower' column label
comparison = {'Model': ['SMA', 'ES', 'STD', 'Prophet'],
              'Total': [514, 121, 45, 304],
              'Upper': [257, 118, '-', 66],
              'Lower': [257, 3, '-', 238]}
comparison = pd.DataFrame(comparison)
comparison = comparison.set_index(comparison.Model.values).drop('Model', axis=1)
comparison
| Total | Upper | lower | |
|---|---|---|---|
| SMA | 514 | 257 | 257 |
| ES | 121 | 128 | 3 |
| STD | 45 | - | - |
| Prophet | 303 | 68 | 235 |
Simple Moving Average (SMA)
I used the simple moving average technique to establish an underlying trend from the data. When calculating the moving average, I set the window size to the number of data points in a day (because the pattern in this case is a daily pattern). To detect anomalies, I compared the difference between each actual value and its corresponding moving-average value, and then defined a tolerance band from the distribution of those differences. Anomalies are the data points that fall outside the tolerance band, and they were visualized by plotting the data overlaid with the band.
Exponential Smoothing Method (ES)
The exponential smoothing method assigns exponentially decreasing weights as observations get older, so more recent data have a larger impact on the prediction. In this practice, I fixed the smoothing level at 0.2 rather than letting the model optimize it. I created a tolerance band that covers the central 95% of the forecasting-residual distribution, and used it to decide which data points are anomalies.
Seasonal-Trend Decomposition Method (STD)
In this method, the time series signal is decomposed into three additive parts: seasonal, trend, and residual. I used the residual part (the difference between the actual value and the model prediction) to build a tolerance band (though this is not necessary) using the three-sigma rule. Anomalies are easily identified by the tolerance band, or the upper and lower ranges that are covered by the three sigmas.
The Facebook Prophet Model
Prophet follows a generalized additive model (GAM) formulation, which adds more flexibility than the STD method. In practice, prediction results from Prophet contain expected upper- and lower-boundary variables, and anomalies can be identified using these two variables.